import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt #plot matplotlib
import seaborn as sns #plot seaborn
color = sns.color_palette()
import plotly.offline as py
py.init_notebook_mode(connected=True)
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.offline as offline
offline.init_notebook_mode()
import os
# Location of the Home Credit input CSV files.
DATA_DIR = "C:/Users/Romansya/Home Credit Risk/input"
print(os.listdir(DATA_DIR))
# Load the four source tables used throughout the analysis.
app_test = pd.read_csv(DATA_DIR + "/app_test.csv")
app_train = pd.read_csv(DATA_DIR + "/app_train.csv")
installment_payment = pd.read_csv(DATA_DIR + "/installment_payment.csv")
prev_app = pd.read_csv(DATA_DIR + "/prev_app.csv")
# Report the dimensions of each table.
for table_name, frame in [('app_test', app_test),
                          ('app_train', app_train),
                          ('installment_payment', installment_payment),
                          ('prev_app', prev_app)]:
    print('Size of %s data' % table_name, frame.shape)
# Peek at the first rows of each table (displayed in a notebook context).
app_test.head()
app_train.head()
installment_payment.head()
prev_app.head()
def _missing_summary(df):
    """Return per-column missing-value statistics, sorted descending.

    The result has two columns: 'Total' (count of NaNs per column) and
    'Percent' (NaNs as a share of rows, 0-100).  Extracted to a helper
    because the original repeated the same three lines for four tables.
    """
    total = df.isnull().sum().sort_values(ascending=False)
    percent = (df.isnull().sum() / df.isnull().count() * 100).sort_values(ascending=False)
    return pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])

# check in app_test
missing_app_test_data = _missing_summary(app_test)
missing_app_test_data.head()
# check in app_train
missing_app_train_data = _missing_summary(app_train)
missing_app_train_data.head()
# check in installment_payment
missing_installment_payment_data = _missing_summary(installment_payment)
missing_installment_payment_data.head()
# check in prev_app
missing_prev_app_data = _missing_summary(prev_app)
missing_prev_app_data.head()
from plotly import tools
def bar_hor(df, col, title, color, w=None, h=None, lm=0, limit=100, return_trace=False, rev=False, xlb = False):
    """Horizontal bar chart of value counts for df[col].

    rev=True plots the tail of the counts instead of the head; a truthy
    xlb replaces the y-axis labels; return_trace=True returns the go.Bar
    trace instead of rendering the figure with iplot.
    """
    counts = df[col].value_counts()
    selected = counts.tail(limit) if rev else counts.head(limit)
    bar_values = selected.values[::-1]
    bar_labels = selected.index[::-1]
    # Custom labels (xlb) override the category index on the y axis.
    y_axis = xlb if xlb else bar_labels
    trace = go.Bar(y=y_axis, x=bar_values, orientation='h', marker=dict(color=color))
    if return_trace:
        return trace
    fig = go.Figure(data=[trace],
                    layout=dict(title=title, margin=dict(l=lm), width=w, height=h))
    iplot(fig)
def bar_hor_noagg(x, y, title, color, w=None, h=None, lm=0, limit=100, rt=False):
    """Horizontal bar chart from pre-aggregated labels (x) and values (y).

    rt=True returns the go.Bar trace instead of rendering the figure.
    Note: `limit` is accepted for signature symmetry but unused.
    """
    trace = go.Bar(y=x, x=y, orientation='h', marker=dict(color=color))
    if rt:
        return trace
    fig = go.Figure(data=[trace],
                    layout=dict(title=title, margin=dict(l=lm), width=w, height=h))
    iplot(fig)
def bar_ver_noagg(x, y, title, color, w=None, h=None, lm=0, rt = False):
    """Vertical bar chart from pre-aggregated categories (x) and values (y).

    rt=True returns the go.Bar trace instead of rendering the figure.
    """
    trace = go.Bar(y=y, x=x, marker=dict(color=color))
    if rt:
        return trace
    fig = go.Figure(data=[trace],
                    layout=dict(title=title, margin=dict(l=lm), width=w, height=h))
    iplot(fig)
def gp(col, title):
    """Build two bar traces over the categories of app_train[col].

    For each category, trace1 gives the percentage of its rows with
    TARGET == 1 and trace2 the percentage with TARGET == 0 (both relative
    to the category's overall count).  Reads the module-level app_train.
    Note: `title` is accepted for call-site symmetry but unused.
    """
    overall = dict(app_train[col].value_counts())
    pos_counts = app_train[app_train["TARGET"] == 1][col].value_counts()
    neg_counts = app_train[app_train["TARGET"] == 0][col].value_counts()
    pos_pct = [float(cnt) * 100 / overall[cat] for cat, cnt in pos_counts.items()]
    neg_pct = [float(cnt) * 100 / overall[cat] for cat, cnt in neg_counts.items()]
    trace1 = go.Bar(x=pos_counts.index, y=pos_pct, name='Target : 1', marker=dict(color="#44ff54"))
    trace2 = go.Bar(x=neg_counts.index, y=neg_pct, name='Target : 0', marker=dict(color="#ff4444"))
    return trace1, trace2
# Target Variable Distribution
# Horizontal bar chart of TARGET counts, relabelled 'Target : 1' / 'Target : 0' via xlb.
bar_hor(app_train, "TARGET", "Distribution of Target Variable" , ["#44ff54", '#ff4444'], h=350, w=600, lm=200, xlb = ['Target : 1','Target : 0'])
The Target variable is slightly imbalanced, with the majority of loans having a target equal to 0, which indicates that those individuals did not have any problems paying their installments on time.
# GENDER: overall count distribution plus per-target percentages.
tr0 = bar_hor(app_train, "GENDER", "Distribution of GENDER Variable" ,"#f975ae", w=700, lm=100, return_trace= True)
tr1, tr2 = gp('GENDER', 'Distribution of Target with Applicant Gender')
# Three side-by-side panels: raw counts, % with payment issues, % without.
fig = tools.make_subplots(rows=1, cols=3, print_grid=False, subplot_titles = ["Gender Distribution" , "Gender, Target=1" ,"Gender, Target=0"])
fig.append_trace(tr0, 1, 1);
fig.append_trace(tr1, 1, 2);
fig.append_trace(tr2, 1, 3);
fig['layout'].update(height=350, showlegend=False, margin=dict(l=50));
iplot(fig);
In the applicant data, women have filed the majority of loan applications — almost double the number filed by men. However, a larger percentage of men (about 10% of the total) had problems repaying the loan or making installments on time, compared to women applicants (about 7%).
# FAMILY_STATUS: overall count distribution plus per-target percentages.
tr0 = bar_hor(app_train, "FAMILY_STATUS", "Distribution of FAMILY_STATUS Variable" ,"#f975ae", w=700, lm=100, return_trace= True)
tr1, tr2 = gp('FAMILY_STATUS', 'Distribution of Target with Applicant Family Status')
fig = tools.make_subplots(rows=1, cols=3, print_grid=False, subplot_titles = ["Family Status Distribution" , "Family Status, Target = 1" ,"Family Status, Target = 0"])
fig.append_trace(tr0, 1, 1);
fig.append_trace(tr1, 1, 2);
fig.append_trace(tr2, 1, 3);
fig['layout'].update(height=350, showlegend=False, margin=dict(l=120));
iplot(fig);
Married people have filed the largest number of loan applications. However, people in a civil marriage have the highest percentage (about 10%) of loan problems and challenges.
# EDUCATION and HOUSING_TYPE: overall distributions shown side by side.
tr1 = bar_hor(app_train, "EDUCATION", "Distribution of Applicant's Education" ,"#f975ae", w=700, lm=100, return_trace= True)
tr2 = bar_hor(app_train, "HOUSING_TYPE", "Distribution of Applicant's House Types" ,"#f975ae", w=700, lm=100, return_trace = True)
fig = tools.make_subplots(rows=1, cols=2, print_grid=False, subplot_titles = ['Applicants Education Type', 'Applicants Housing Type' ])
fig.append_trace(tr1, 1, 1);
fig.append_trace(tr2, 1, 2);
fig['layout'].update(height=400, showlegend=False, margin=dict(l=100));
iplot(fig);
# Per-category percentage with TARGET == 1 for education and housing type.
# (tr2/tr4 — the Target=0 traces — are intentionally not plotted here.)
tr1, tr2 = gp('EDUCATION', 'Applicants Education Types which repayed the loan')
tr3, tr4 = gp('HOUSING_TYPE', 'Applicants Housing Types which repayed the loan')
fig = tools.make_subplots(rows=1, cols=2, print_grid=False,
                          subplot_titles = ["Applicants Education Types, Target=1", "Applicants Housing Type, Target=1"])
fig.append_trace(tr1, 1, 1);
fig.append_trace(tr3, 1, 2);
fig['layout'].update(height=350, showlegend=False, margin=dict(l=30));
iplot(fig);
A large number of applications (44K) were filed by people with secondary education, followed by people with higher education at 15K applications. Applicants living in houses/apartments have the highest number of loan applications, equal to 55K. Meanwhile, applicants with a lower-secondary education status show the highest percentage of payment-related problems; applicants living in rented apartments or with their parents show the same trend.
# INCOME_TYPE: overall distribution of applicant income types.
tr1 = bar_hor(app_train, "INCOME_TYPE", "Distribution of INCOME_TYPE Variable" ,"#f975ae", w=700, lm=100, return_trace= True)
fig = tools.make_subplots(rows=1, cols=2, print_grid=False, subplot_titles = ['Applicants Income Type'])
# Fix: the trace was appended at (1, 2) — the empty second column — while
# the only subplot title sits over column 1; place it under its title.
fig.append_trace(tr1, 1, 1);
fig['layout'].update(height=400, showlegend=False, margin=dict(l=100));
iplot(fig);
The income types of people who apply for loans fall into about 8 categories; the top ones are:
# Pie chart of loan contract types (cash vs revolving) in the training data.
contract_counts = app_train['CONTRACT_TYPE'].value_counts()
trace = go.Pie(
    labels=contract_counts.index,
    values=contract_counts.values,
    hoverinfo='all',
    textinfo='none',
    textfont=dict(size=12),
    marker=dict(colors=['#FEBFB3','#96D38C'],
                line=dict(color='#fff', width=2)),
)
layout = go.Layout(title='Applicants Contract Type', height=400)
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)
Cash loans, at about 56K, make up the majority of the total loans in this dataset. Revolving loans are significantly fewer, at about 5K, compared to cash loans.
# ORGANIZATION_TYPE: overall distribution (left column of a 2-column grid;
# the second column is left empty, presumably to halve the chart width — TODO confirm).
tr1 = bar_hor(app_train, "ORGANIZATION_TYPE", "Distribution of " ,"#f975ae", w=700, lm=100, return_trace= True)
fig = tools.make_subplots(rows=1, cols=2, print_grid=False, subplot_titles = ['Applicants Organization Type'])
fig.append_trace(tr1, 1, 1);
fig['layout'].update(height=600, showlegend=False, margin=dict(l=150));
iplot(fig);
# Percentage of applicants with TARGET == 1 per organization type
# (tr2, the Target=0 trace, is intentionally not plotted).
tr1, tr2 = gp('ORGANIZATION_TYPE', 'Applicants Income Types which repayed the loan')
fig = tools.make_subplots(rows=1, cols=2, print_grid=False,
                          subplot_titles = ["Applicants Organization Types - Repayed"])
fig.append_trace(tr1, 1, 1);
fig['layout'].update(height=350, showlegend=False, margin=dict(l=120));
iplot(fig);
# Histograms + KDE of the main continuous variables in app_train.
# NOTE(review): sns.distplot is deprecated in modern seaborn
# (replaced by histplot/displot) — confirm the installed version supports it.
plt.figure(figsize=(12,5))
plt.title("Distribution of APPROVED_CREDIT")
ax = sns.distplot(app_train["APPROVED_CREDIT"])
# ANNUITY and PRICE contain NaNs (see missing-value checks above), so drop them first.
plt.figure(figsize=(12,5))
plt.title("Distribution of ANNUITY")
ax = sns.distplot(app_train["ANNUITY"].dropna())
plt.figure(figsize=(12,5))
plt.title("Distribution of PRICE")
ax = sns.distplot(app_train["PRICE"].dropna())
plt.figure(figsize=(12,5))
plt.title("Distribution of DAYS_AGE")
ax = sns.distplot(app_train["DAYS_AGE"])
plt.figure(figsize=(12,5))
plt.title("Distribution of DAYS_WORK")
ax = sns.distplot(app_train["DAYS_WORK"])
plt.figure(figsize=(12,5))
plt.title("Distribution of DAYS_REGISTRATION")
ax = sns.distplot(app_train["DAYS_REGISTRATION"])
# Bar chart: number of applicants by number of children.
child_counts = app_train["NUM_CHILDREN"].value_counts()
t1 = pd.DataFrame({'x': child_counts.index, 'y': child_counts.values})
plt.figure(figsize=(12,5));
plt.title("Distribution of Applicant's Number of Children");
ax = sns.barplot(data=t1, x="x", y="y", color="#f975ae");
# Drop the top/right frame lines and axis labels for a cleaner look.
for side in ('right', 'top'):
    ax.spines[side].set_visible(False)
ax.set_ylabel('');
ax.set_xlabel('');
# Pie chart of contract statuses from previous applications.
status_counts = prev_app['CONTRACT_STATUS'].value_counts()
trace = go.Pie(
    labels=status_counts.index,
    values=status_counts.values,
    hoverinfo='all',
    textinfo='none',
    textfont=dict(size=12),
    marker=dict(colors=['#96D38C', '#E1396C', '#FEBFB3', '#D0F9B1'],
                line=dict(color='#fff', width=2)),
)
layout = go.Layout(title='Name Contract Status in Previous Applications', height=400)
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)
A large number of people (about 63%) had their previous applications approved, while about 18% had cancelled and another 17% were refused.
from sklearn.model_selection import train_test_split
import lightgbm as lgb
# read the test files
# (re-read so app_test is the raw table, untouched by the EDA above)
app_test = pd.read_csv('C:/Users/Romansya/Home Credit Risk/input/app_test.csv')
# Flag rows so train and test can be separated again after joint preprocessing.
app_test['is_test'] = 1
app_test['is_train'] = 0
app_train['is_test'] = 0
app_train['is_train'] = 1
# target variable
Y = app_train['TARGET']
train_X = app_train.drop(['TARGET'], axis = 1)
# test ID
# (kept aside to build the submission file after prediction)
test_id = app_test['LN_ID']
test_X = app_test
# merge train and test datasets for preprocessing
data = pd.concat([train_X, test_X], axis=0, sort=True)
# function to obtain Categorical Features
def _get_categorical_features(df):
feats = [col for col in list(df.columns) if df[col].dtype == 'object']
return feats
# function to factorize categorical features
def _factorize_categoricals(df, cats):
for col in cats:
df[col], _ = pd.factorize(df[col])
return df
# function to create dummy variables of categorical features
def _get_dummies(df, cats):
for col in cats:
df = pd.concat([df, pd.get_dummies(df[col], prefix=col)], axis=1)
return df
# get categorical features
data_cats = _get_categorical_features(data)
prev_app_cats = _get_categorical_features(prev_app)
# create additional dummy features -
prev_app = _get_dummies(prev_app, prev_app_cats)
# factorize the categorical features from train and test data
data = _factorize_categoricals(data, data_cats)
# count the number of previous applications for a given ID
# NOTE: SK_ID_PREV is overwritten with the per-applicant application count,
# so the subsequent per-LN_ID mean aggregates it into that count.
prev_apps_count = prev_app[['LN_ID', 'SK_ID_PREV']].groupby('LN_ID').count()
prev_app['SK_ID_PREV'] = prev_app['LN_ID'].map(prev_apps_count['SK_ID_PREV'])
# Average values for all other features in previous applications
# (features prefixed 'p_' and left-joined onto the combined train/test frame)
prev_apps_avg = prev_app.groupby('LN_ID').mean()
prev_apps_avg.columns = ['p_' + col for col in prev_apps_avg.columns]
data = data.merge(right=prev_apps_avg.reset_index(), how='left', on='LN_ID')
## count the number of previous installments
# (same trick as for prev_app: SK_ID_PREV becomes the per-applicant count)
cnt_inst = installment_payment[['LN_ID', 'SK_ID_PREV']].groupby('LN_ID').count()
installment_payment['SK_ID_PREV'] = installment_payment['LN_ID'].map(cnt_inst['SK_ID_PREV'])
## Average values for all other variables in installments payments
# (features prefixed 'i_' and left-joined onto the combined frame)
avg_inst = installment_payment.groupby('LN_ID').mean()
avg_inst.columns = ['i_' + f_ for f_ in avg_inst.columns]
data = data.merge(right=avg_inst.reset_index(), how='left', on='LN_ID')
# Split the combined frame back into train/test using the flags set earlier;
# drop the ID and flag columns from the model features.
ignore_features = ['LN_ID', 'is_train', 'is_test']
relevant_features = [col for col in data.columns if col not in ignore_features]
trainX = data[data['is_train'] == 1][relevant_features]
testX = data[data['is_test'] == 1][relevant_features]
# 80/20 train/validation split; the validation set drives early stopping.
x_train, x_val, y_train, y_val = train_test_split(trainX, Y, test_size=0.2, random_state=18)
lgb_train = lgb.Dataset(data=x_train, label=y_train)
lgb_eval = lgb.Dataset(data=x_val, label=y_val)
# LightGBM hyper-parameters: binary objective evaluated by AUC.
# NOTE(review): 'num_iteration' inside params is presumably meant as the
# number of boosting rounds (LightGBM alias `num_iterations`) — confirm the
# alias is recognized; otherwise training runs with the default round count
# until early stopping triggers.
params = {'task': 'train', 'boosting_type': 'gbdt', 'objective': 'binary', 'metric': 'auc',
          'learning_rate': 0.01, 'num_leaves': 48, 'num_iteration': 5000, 'verbose': 0 ,
          'colsample_bytree':.8, 'subsample':.9, 'max_depth':7, 'reg_alpha':.1, 'reg_lambda':.1,
          'min_split_gain':.01, 'min_child_weight':1}
# Train with early stopping on the validation AUC; log every 200 rounds.
model = lgb.train(params, lgb_train, valid_sets=lgb_eval, early_stopping_rounds=150, verbose_eval=200)
lgb.plot_importance(model, figsize=(12, 25), max_num_features=100);
# Predict default probabilities for the test set and write the submission.
pred = model.predict(testX)
sub_lgb = pd.DataFrame()
sub_lgb['LN_ID'] = test_id
sub_lgb['TARGET'] = pred
sub_lgb.to_csv("hc_try_lgb_baseline.csv", index=False)
sub_lgb.head()
sub_lgb.head()
# Binarize the predicted probabilities at the decision threshold.
thresh = 0.5
# Fix: use the named `thresh` instead of a duplicated literal 0.5, so
# changing the threshold actually changes the classification.
sub_lgb['PREDICTED_TARGET'] = (sub_lgb.TARGET >= thresh).astype('int')
sub_lgb.head()
# Attach ground-truth labels for the evaluation below.
# NOTE(review): assumes this app_test.csv contains a TARGET column — confirm.
sub_lgb['REAL_TARGET'] = app_test['TARGET']
sub_lgb.head()
from sklearn.metrics import confusion_matrix
# Confusion matrix of true labels vs thresholded predictions.
confusion_matrix(sub_lgb.REAL_TARGET.values, sub_lgb.PREDICTED_TARGET.values)
def find_TP(y_true, y_pred):
    """Count true positives: positions where y_true == 1 and y_pred == 1."""
    return sum(1 for actual, predicted in zip(y_true, y_pred)
               if actual == 1 and predicted == 1)
def find_FN(y_true, y_pred):
    """Count false negatives: positions where y_true == 1 but y_pred == 0."""
    return sum(1 for actual, predicted in zip(y_true, y_pred)
               if actual == 1 and predicted == 0)
def find_FP(y_true, y_pred):
    """Count false positives: positions where y_true == 0 but y_pred == 1."""
    return sum(1 for actual, predicted in zip(y_true, y_pred)
               if actual == 0 and predicted == 1)
def find_TN(y_true, y_pred):
    """Count true negatives: positions where y_true == 0 and y_pred == 0."""
    return sum(1 for actual, predicted in zip(y_true, y_pred)
               if actual == 0 and predicted == 0)
# Report each confusion-matrix cell for the thresholded predictions.
print('TP:',find_TP(sub_lgb.REAL_TARGET.values, sub_lgb.PREDICTED_TARGET.values))
print('FN:',find_FN(sub_lgb.REAL_TARGET.values, sub_lgb.PREDICTED_TARGET.values))
print('FP:',find_FP(sub_lgb.REAL_TARGET.values, sub_lgb.PREDICTED_TARGET.values))
print('TN:',find_TN(sub_lgb.REAL_TARGET.values, sub_lgb.PREDICTED_TARGET.values))
import numpy as np
def find_conf_matrix_values(y_true,y_pred):
    """Return the tuple (TP, FN, FP, TN) for binary labels and predictions."""
    return (find_TP(y_true, y_pred),
            find_FN(y_true, y_pred),
            find_FP(y_true, y_pred),
            find_TN(y_true, y_pred))
def my_confusion_matrix(y_true, y_pred):
    """Confusion matrix as np.array [[TN, FP], [FN, TP]] (sklearn's layout)."""
    tp, fn, fp, tn = find_conf_matrix_values(y_true, y_pred)
    return np.array([[tn, fp], [fn, tp]])
# Sanity-check the hand-rolled matrix against sklearn's, then report metrics.
my_confusion_matrix(sub_lgb.REAL_TARGET.values, sub_lgb.PREDICTED_TARGET.values)
from sklearn.metrics import accuracy_score
# Overall fraction of correct predictions.
accuracy_score(sub_lgb.REAL_TARGET.values, sub_lgb.PREDICTED_TARGET.values)
from sklearn.metrics import recall_score
# TP / (TP + FN): share of actual positives that were flagged.
recall_score(sub_lgb.REAL_TARGET.values, sub_lgb.PREDICTED_TARGET.values)
from sklearn.metrics import precision_score
# TP / (TP + FP): share of flagged applicants that were actual positives.
precision_score(sub_lgb.REAL_TARGET.values, sub_lgb.PREDICTED_TARGET.values)
from sklearn.metrics import f1_score
# Harmonic mean of precision and recall.
f1_score(sub_lgb.REAL_TARGET.values, sub_lgb.PREDICTED_TARGET.values)